#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
bloomberg.com
'''
from datetime import datetime, timedelta
from lxml import html, etree
from StringIO import StringIO
from calibre.web.feeds.recipes import BasicNewsRecipe
import urllib2
from collections import OrderedDict
import calendar
from calibre.ebooks.BeautifulSoup import Tag


# Index page listing the contributors; also the base from which the
# per-contributor article-list URLs are built.
contributors_url = "https://www.bloomberg.com/view/contributors"
# strftime format used for the date shown in each article's author/date field.
output_date_format = "%d %b, %H:%M"

# Browser-like request headers sent with every page fetch made by
# get_article_parsed.
# NOTE(review): presumably needed because the site rejects the default
# urllib2 user agent, and 'Accept-Encoding: none' presumably avoids compressed
# responses that are not decoded here -- confirm.
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}


def get_article_parsed(this_url):
    """Download ``this_url`` and return it parsed as an lxml HTML tree.

    The request is sent with the module-level ``hdr`` headers.

    :param this_url: absolute URL of the page to fetch.
    :return: the tree produced by ``lxml.html.parse`` for the page body.
    :raises urllib2.URLError: propagated unchanged when the fetch fails.
    """
    req = urllib2.Request(this_url, headers=hdr)
    page = urllib2.urlopen(req)
    try:
        content = page.read()
    finally:
        # This function is called once per contributor, so close the response
        # explicitly rather than leaving sockets open until garbage collection.
        page.close()
    parser = etree.HTMLParser()
    return html.parse(StringIO(content), parser)


class BloombergContributor:
    """A single contributor and the articles listed on their page.

    Every entry of the article list is a tuple
    ``(title, link, summary, article_date, timestamp)``. ``article_date`` is a
    naive ``datetime`` taken from the listing (``None`` when the listing has
    no parsable date) and ``timestamp`` is the value returned by
    ``get_article_timestamp`` for that date.
    """
    _name = None
    _url_name = None
    _url_code = None
    _article_list = None  # article is title, link, summary, date, timestamp
    # Formats tried in order by parse_date_str: full, then abbreviated month.
    date_formats = ["%B %d, %Y %I:%M %p", "%b %d, %Y %I:%M %p"]

    def __init__(self, name, url_name, url_code):
        """Store the contributor's display name and the two URL components."""
        self._name = name
        self._url_name = url_name
        self._url_code = url_code
        self._article_list = []

    def __str__(self):
        return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))

    def populate_article_list(self):
        """Fetch the contributor's article page and append its entries.

        List items without a usable ``<a href>`` are skipped. Items whose date
        is missing or unparsable are kept with ``None`` for both the date and
        the timestamp.
        """
        list_url = "{0}/{1}/{2}/articles".format(
            contributors_url, self._url_code, self._url_name)
        parsed_list = get_article_parsed(list_url)
        articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
        for article in articles:
            headline = article.find('a')
            if headline is None or 'href' not in headline.attrib:
                # Nothing to link to, so there is no article to record.
                continue
            link = headline.attrib['href']
            # .text is None when the anchor has no leading text node.
            title = (headline.text or "").strip()

            article_date = None
            article_date_eles = article.xpath(
                ".//span[contains(@class, 'time_3qQJR')]")
            if article_date_eles and article_date_eles[0].text:
                article_date = self.parse_date_str(
                    article_date_eles[0].text.strip())

            summary_eles = article.xpath(
                ".//p[contains(@class, 'summary_17SO6')]")
            if summary_eles and summary_eles[0].text:
                summary = summary_eles[0].text.strip()
            else:
                summary = "No summary..."

            self._article_list.append((title.encode('ascii', 'ignore'), link, summary.encode('ascii', 'ignore'),
                                      article_date, self.get_article_timestamp(article_date)))

    @staticmethod
    def get_article_timestamp(article_date):
        """Convert a naive US-Eastern ``datetime`` to a POSIX timestamp.

        All times are assumed to be Eastern. Daylight saving time is taken to
        run from 02:00 on the second Sunday of March up to 01:00 on the first
        Sunday of November; inside that window the UTC offset is 4 hours,
        otherwise 5.

        :param article_date: naive ``datetime`` or ``None``.
        :return: seconds since the epoch as ``float``, or ``None`` when
            ``article_date`` is ``None``.
        """
        if article_date is None:
            return None
        cal = calendar.Calendar(firstweekday=calendar.SUNDAY)

        def sundays_in(month):
            # monthdatescalendar pads weeks with days of adjacent months, so
            # filter on the month as well as on the weekday.
            weeks = cal.monthdatescalendar(article_date.year, month)
            return [day for week in weeks for day in week
                    if day.weekday() == calendar.SUNDAY and day.month == month]

        start_day = sundays_in(3)[1]
        end_day = sundays_in(11)[0]
        dst_start = datetime(start_day.year, start_day.month, start_day.day, 2)
        dst_end = datetime(end_day.year, end_day.month, end_day.day, 1)
        # DST applies between the March start and the November end.
        if dst_start <= article_date < dst_end:
            shift = timedelta(hours=4)
        else:
            shift = timedelta(hours=5)
        return float((article_date + shift - datetime(1970, 1, 1)).total_seconds())

    def parse_date_str(self, date_str):
        """Parse a listing date string, ignoring its last four characters.

        The last four characters are dropped before parsing (presumably a
        trailing time-zone abbreviation such as " EDT"). Each format in
        ``date_formats`` is tried in turn.

        :return: a naive ``datetime``, or ``None`` if no format matches.
        """
        for date_format in self.date_formats:
            try:
                return datetime.strptime(date_str[0:-4], date_format)
            except (TypeError, ValueError):
                # Wrong format (or not a string at all): try the next one.
                continue
        return None

    def get_article_list(self):
        """Return the list of article tuples collected so far."""
        return self._article_list

    def get_ordered_article_feed(self):
        """Return the dated articles as feed dicts, newest first.

        :return: ``OrderedDict`` mapping each article's ``datetime`` to a dict
            with the keys ``title``, ``url``, ``description``, ``author``,
            ``date`` and ``timestamp``. Articles without a date are omitted
            because they can be neither labelled nor ordered.
        """
        output = OrderedDict()
        for article in self._article_list:
            article_date = article[3]
            if article_date is None:
                continue
            byline = self.get_name() + ": " + article_date.strftime(output_date_format)
            output[article_date] = {
                'title': article[0],
                'url': article[1],
                'description': "{0}: {1}".format(self.get_name(), article[2]),
                'author': byline,
                'date': byline,
                'timestamp': article[4],
            }
        return OrderedDict(sorted(output.items(), key=lambda t: t[0], reverse=True))

    def get_name(self):
        """Return the contributor's display name."""
        return self._name


class BloombergContributors(BasicNewsRecipe):
    """Recipe that builds a single "Columns" feed from contributor pages.

    The contributor index page is scraped for contributor links, each
    contributor's article list is fetched, and all dated articles are merged
    into one feed ordered newest first.
    """
    title = u'Bloomberg, Editorial Contributors'
    description = 'Articles from Bloomberg.com contributors'
    __author__ = 'Dale Furrow'
    xpath_contributor_list = """//li[contains(@class, 'item_2zsS8')]/a"""
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
    # note space...
    remove_tags = [
        dict(name='div', attrs={'class': ['share-article-button ', 'text-to-speech']})]
    oldest_article = 7.0
    ignore_duplicate_articles = {'url'}
    recursions = 0
    category = 'news, USA, world, economy, politics'

    def get_contributors_list(self):
        """Scrape the contributor index and load every contributor's articles.

        Links without a ``<span>`` name or whose ``href`` has fewer than five
        ``/``-separated parts are skipped, since the URL code and URL name are
        read from positions 3 and 4 of the split ``href``.

        :return: list of ``BloombergContributor`` objects with their article
            lists populated.
        """
        page_doc = get_article_parsed(contributors_url)
        contributor_list = []
        for el in page_doc.xpath(self.xpath_contributor_list):
            name_el = el.find("span")
            href_parts = el.attrib.get('href', '').split('/')
            if name_el is None or not name_el.text or len(href_parts) < 5:
                self.log("Skipping malformed contributor link: {0}".format(
                    el.attrib.get('href')))
                continue
            contributor_list.append(BloombergContributor(
                name_el.text.strip(), href_parts[4], href_parts[3]))
        for contributor in contributor_list:
            contributor.populate_article_list()
        return contributor_list

    def postprocess_html(self, soup, first_fetch):
        '''
        Replace the article's ``<time class="article-timestamp">`` element
        with a ``<p class="user-inserted">`` holding a readable UTC time.

        :param soup: A `BeautifulSoup
        <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>
        `_  instance containing the downloaded :term:`HTML`.
        :param first_fetch: True if this is the first page of an article.
        :return: the same ``soup`` object, modified in place.
        Remember: BeautifulSoup3! Interface is much different than bs4
        '''
        time_eles = soup.findAll("time", {"class": "article-timestamp"})
        if time_eles:
            time_ele = time_eles[0]
            time_stamp = time_ele.get('datetime')
            # Without a datetime attribute there is nothing to show, so the
            # element is left as it is.
            if time_stamp:
                try:
                    parsed_time = datetime.strptime(time_stamp, "%Y-%m-%dT%H:%M:%S.%fZ")\
                        .strftime("%B %d, %Y %I:%M %p") + " UTC"
                except (TypeError, ValueError):
                    # Unexpected format: fall back to the raw attribute value.
                    parsed_time = time_stamp
                insert_tag = Tag(soup, "p", [("class", "user-inserted")])
                insert_tag.insert(0, parsed_time)
                # Replace the element that was matched above; soup.time would
                # be the first <time> in the document, whatever its class.
                time_ele.replaceWith(insert_tag)

        return soup

    def parse_index(self):
        """Build the feed list for the recipe.

        :return: a one-element list ``[("Columns", articles)]`` where
            ``articles`` holds every contributor's article dicts, newest
            first.
        """
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: {0}'.format(
            datetime.now().strftime("%B %d, %Y %I:%M %p")))
        feeds = []
        contributor_list = self.get_contributors_list()
        self.log("Found {0:d} contibutors on main page".format(
            len(contributor_list)))
        # Keep (date, article) pairs in a list rather than a dict keyed by
        # date, so articles from different contributors that share a
        # publication time do not overwrite one another.
        dated_articles = []
        for contributor in contributor_list:
            dated_articles.extend(contributor.get_ordered_article_feed().items())
        dated_articles.sort(key=lambda pair: pair[0], reverse=True)
        self.log("Found {0:d} linked articles from contributors".format(
            len(dated_articles)))
        feeds.append(("Columns", [pair[1] for pair in dated_articles]))
        self.log('finishing parse_index: {0}'.format(
            datetime.now().strftime("%B %d, %Y %I:%M %p")))
        return feeds
